Heloderma suspectum, the Gila Monster

1a: Explore

Gather species data for Heloderma suspectum

obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo    <- FALSE

if (!file.exists(obs_geo) | redo){
  # get species occurrence data from GBIF with coordinates
  (res <- spocc::occ(
    query = 'Heloderma suspectum', 
    from = 'gbif', has_coords = T,
    limit = 10000))
  
  # extract data frame from result
  df <- res$gbif$data[[1]] 
  readr::write_csv(df, obs_csv)
  
  # convert to points of observation from lon/lat columns in data frame
  obs <- df %>% 
    sf::st_as_sf(
      coords = c("longitude", "latitude"),
      crs = st_crs(4326)) %>% 
    select(prov, key) %>% # save space (joinable from obs_csv)
    subset(!key %in% c(861689291, 1050659944, 1145541957, 686703568)) 
  sf::write_sf(obs, obs_geo, delete_dsn=T)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 1258
mapview::mapview(obs, map.types = "OpenTopoMap")

Cleaning steps

Several observations were rerecorded with a lat/long of (0,0) as indicated by the dots off the coast of Africa (visible if the above code is run without line 58). Additionally, an investigation of Gila Monster habitat on Google shows that the points in the Midwest and California coast are far out of habitable range. Sources disagree on whether or not the true range extends into southern Mexico, so for the purposes of this analysis those observations will remain in the dataset.

Total observations

After removing these four bad observations, we are left with 1258 observations of Heloderma suspectum from the GBIF database.

Environmental layer rationale

I used the same default layers presented in the lab template: WC_alt, WC_bio1, WC_bio2, ER_tri, ER_topoWet.

I concluded that these would be appropriate metrics because1:

  • Annual mean temp
  • Mean diurnal temp range
  • Topographic wetness

Are all different ways of measuring climactic variables that are associated with deserts, the Gila Monster’s native habitat. Including all three should allow us to identify which features of the desert might be the most important in identifying suitable habitat.

  • Altitude
  • Terrain roughness

Both measure adjacent metrics that could help narrow down the range: for example, as slow-moving reptiles they may prefer a sweet spot of terrain roughness that isn’t too hard for them to navigate but also provides enough nooks and crannies to hide in. Altitude is somewhat of an intermediary between terrain and temperature variables, with lower elevations often flatter in the Southwest and also warmer.

Environmental Data

# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)

# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")

# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)

# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio2", "ER_tri", "ER_topoWet")

# get layers
env_stack <- load_layers(env_layers_vec)

# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc=2)

obs_hull_geo  <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")

if (!file.exists(obs_hull_geo) | redo){
  # make convex hull around points of observation
  obs_hull <- sf::st_convex_hull(st_union(obs))
  
  # save obs hull
  write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)
# show points on map
mapview(
  list(obs, obs_hull), map.types = "OpenTopoMap")
if (!file.exists(env_stack_grd) | redo){
  obs_hull_sp <- sf::as_Spatial(obs_hull)
  env_stack <- raster::mask(env_stack, obs_hull_sp) %>% 
    raster::crop(extent(obs_hull_sp))
  writeRaster(env_stack, env_stack_grd, overwrite=T)  
}
env_stack <- stack(env_stack_grd)

# show map
# mapview(obs) + 
#   mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)

Create & Combine Pseudo-Absences

absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

if (!file.exists(absence_geo) | redo){
  # get raster count of observations
  r_obs <- rasterize(
    sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
  
  # show map
  # mapview(obs) + 
  #   mapview(r_obs)
  
  # create mask for 
  r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
  
  # generate random points inside mask
  absence <- dismo::randomPoints(r_mask, nrow(obs)) %>% 
    as_tibble() %>% 
    st_as_sf(coords = c("x", "y"), crs = 4326)
  
  write_sf(absence, absence_geo, delete_dsn=T)
}
absence <- read_sf(absence_geo)
# show map of presence (ie obs) and absence
mapview(obs, col.regions = "green") + 
  mapview(absence, col.regions = "gray")
if (!file.exists(pts_env_csv) | redo){

  # combine presence and absence into single set of labeled points 
  pts <- rbind(
    obs %>% 
      mutate(
        present = 1) %>% 
      select(present, key),
    absence %>% 
      mutate(
        present = 0,
        key     = NA)) %>% 
    mutate(
      ID = 1:n()) %>% 
    relocate(ID)
  write_sf(pts, pts_geo, delete_dsn=T)

  # extract raster values for points
  pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>% 
    tibble() %>% 
    # join present and geometry columns to raster value results for points
    left_join(
      pts %>% 
        select(ID, present),
      by = "ID") %>% 
    relocate(present, .after = ID) %>% 
    # extract lon, lat as single columns
    mutate(
      #present = factor(present),
      lon = st_coordinates(geometry)[,1],
      lat = st_coordinates(geometry)[,2]) %>% 
    select(-geometry)
  write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)

Term Plots

pts_env %>% 
  select(-ID) %>% 
  mutate(
    present = factor(present)) %>% 
  pivot_longer(-present) %>% 
  ggplot() +
  geom_density(aes(x = value, fill = present)) + 
  scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  theme_bw() + 
  facet_wrap(~name, scales = "free") +
  theme(
    legend.position = c(1, 0),
    legend.justification = c(1, 0))

1b: Regress

GGally::ggpairs(
  select(pts_env, -ID),
  aes(color = factor(present)))

Logistic Regressions

Linear Model

d <- pts_env %>% 
  select(-ID) %>%  # remove terms we don't want to model
  tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 2508
# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
## 
## Call:
## lm(formula = present ~ ., data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1038 -0.3699  0.1417  0.3169  0.9387 
## 
## Coefficients:
##                Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)  1.79779362  0.86250731   2.084               0.0372 *  
## WC_alt       0.00007040  0.00005537   1.272               0.2037    
## WC_bio1      0.05888077  0.00935076   6.297       0.000000000358 ***
## WC_bio2     -0.04023579  0.00642192  -6.265       0.000000000437 ***
## ER_tri      -0.00078104  0.00051445  -1.518               0.1291    
## ER_topoWet  -0.03075911  0.01279483  -2.404               0.0163 *  
## lon          0.04101600  0.00758717   5.406       0.000000070564 ***
## lat          0.10156525  0.00666823  15.231 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4077 on 2500 degrees of freedom
## Multiple R-squared:  0.3373, Adjusted R-squared:  0.3354 
## F-statistic: 181.8 on 7 and 2500 DF,  p-value: < 0.00000000000000022
y_predict <- predict(mdl, d, type="response")
y_true    <- d$present

range(y_predict)
## [1] -0.4040449  1.1093685

GLM

# fit a generalized linear model with a binomial logit link function
mdl_logit <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl_logit)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = d)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.70006  -0.77577  -0.05915   0.81018   2.50613  
## 
## Coefficients:
##               Estimate Std. Error z value             Pr(>|z|)    
## (Intercept) 12.9185927  6.0093620   2.150               0.0316 *  
## WC_alt      -0.0001416  0.0003924  -0.361               0.7182    
## WC_bio1      0.3326938  0.0604448   5.504       0.000000037108 ***
## WC_bio2     -0.1743633  0.0396090  -4.402       0.000010719951 ***
## ER_tri      -0.0019532  0.0033282  -0.587               0.5573    
## ER_topoWet  -0.2045338  0.0804176  -2.543               0.0110 *  
## lon          0.3160960  0.0516575   6.119       0.000000000941 ***
## lat          0.6855362  0.0464695  14.752 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3476.8  on 2507  degrees of freedom
## Residual deviance: 2412.1  on 2500  degrees of freedom
## AIC: 2428.1
## 
## Number of Fisher Scoring iterations: 5
y_predict <- predict(mdl_logit, d, type="response")

range(y_predict)
## [1] 0.001602979 0.975640706
# show term plots
termplot(mdl_logit, partial.resid = TRUE, se = TRUE, main = F, ylim="free")

GAM

mdl_add <- mgcv::gam(
  formula = present ~ s(WC_alt) + s(WC_bio1) + 
    s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat), 
  family = binomial, data = d)
summary(mdl_add)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + 
##     s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.2921     0.2985  -4.329 0.000015 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df  Chi.sq              p-value    
## s(WC_alt)     5.769  6.853  29.573             0.000132 ***
## s(WC_bio1)    7.112  8.133  45.708 < 0.0000000000000002 ***
## s(WC_bio2)    1.513  1.884  15.518             0.001884 ** 
## s(ER_tri)     2.709  3.500   9.752             0.026552 *  
## s(ER_topoWet) 8.138  8.761  33.556            0.0000792 ***
## s(lon)        6.492  7.595 157.875 < 0.0000000000000002 ***
## s(lat)        8.478  8.847 180.041 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.645   Deviance explained = 59.4%
## UBRE = -0.40395  Scale est. = 1         n = 2508
# show term plots
plot(mdl_add, scale=0)

GAM presence contribution

Most environmental variables hover around zero, jump around, and/or have large error margins. Interestingly, WC_bio2 (mean diurnal temperature range) is the most consistent as both a predictor of presence and absence: at low levels it is predictive of presence, but at high levels is predictive of absence. Longitude is largely a negative predictor, but with exceptionally low confidence at higher values.

Maximum Entropy

mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")
# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>% 
  sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class

# fit a maximum entropy model
if (!file.exists(mdl_maxent_rds)){
  mdl_mx <- maxent(env_stack, obs_sp)
  readr::write_rds(mdl_mx, mdl_maxent_rds)
}
mdl_mx <- read_rds(mdl_maxent_rds)
# plot variable contributions per predictor
plot(mdl_mx)

# plot term plots
response(mdl_mx)

# predict
y_predict <- predict(env_stack, mdl_mx) #, ext=ext, progress='')

plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

Maxent presence contribution

Low values of WC_alt is a strong positive predictor, as is low WC_bio1. Likewise, high values of both those variables are strong negative predictors. Interestingly, Maxent identifies the opposite effect of WC_bio2 as the GAM did: here, high WC_bio2 is a positive predictor.

1c: Trees

# read data
pts_env <- read_csv(pts_env_csv)
d <- pts_env %>% 
  select(-ID) %>%                   # not used as a predictor x
  mutate(
    present = factor(present)) %>%  # categorical response
  na.omit()                         # drop rows with NA

Decision Trees

# create training set with 80% of full data
d_split  <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train  <- rsample::training(d_split)

# show number of rows present is 0 vs 1 in each df
table(d$present)
## 
##    0    1 
## 1256 1252
table(d_train$present)
## 
##    0    1 
## 1004 1001
# simplest tree with maxdepth = 1
mdl1 <- rpart(
  present ~ ., data = d_train, 
  control = list(
    cp = 0, minbucket = 5, maxdepth = 1))
mdl1
## n= 2005 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 2005 1001 0 (0.50074813 0.49925187)  
##   2) lon>=-108.7087 645    7 0 (0.98914729 0.01085271) *
##   3) lon< -108.7087 1360  366 1 (0.26911765 0.73088235) *
# plot tree (depth=1)
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl1)

# decision tree with default settings
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 2005 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 2005 1001 0 (0.50074813 0.49925187)  
##     2) lon>=-108.7087 645    7 0 (0.98914729 0.01085271) *
##     3) lon< -108.7087 1360  366 1 (0.26911765 0.73088235)  
##       6) lon< -112.1894 323  145 0 (0.55108359 0.44891641)  
##        12) lon>=-112.3996 21    0 0 (1.00000000 0.00000000) *
##        13) lon< -112.3996 302  145 0 (0.51986755 0.48013245)  
##          26) ER_topoWet< 12.9 270  118 0 (0.56296296 0.43703704)  
##            52) lat< 36.9833 247   98 0 (0.60323887 0.39676113)  
##             104) lat>=32.3698 204   70 0 (0.65686275 0.34313725) *
##             105) lat< 32.3698 43   15 1 (0.34883721 0.65116279) *
##            53) lat>=36.9833 23    3 1 (0.13043478 0.86956522) *
##          27) ER_topoWet>=12.9 32    5 1 (0.15625000 0.84375000) *
##       7) lon>=-112.1894 1037  188 1 (0.18129219 0.81870781)  
##        14) lon>=-110.5494 274  114 1 (0.41605839 0.58394161)  
##          28) lat< 31.20907 96   27 0 (0.71875000 0.28125000) *
##          29) lat>=31.20907 178   45 1 (0.25280899 0.74719101)  
##            58) lat>=33.20688 25    2 0 (0.92000000 0.08000000) *
##            59) lat< 33.20688 153   22 1 (0.14379085 0.85620915) *
##        15) lon< -110.5494 763   74 1 (0.09698558 0.90301442)  
##          30) lat>=34.41848 16    0 0 (1.00000000 0.00000000) *
##          31) lat< 34.41848 747   58 1 (0.07764391 0.92235609) *
rpart.plot(mdl)

# plot complexity parameter
plotcp(mdl)

# rpart cross validation results
mdl$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.62737263      0 1.0000000 1.0879121 0.02228283
## 2 0.03296703      1 0.3726274 0.3736264 0.01742495
## 3 0.02097902      2 0.3396603 0.3526474 0.01703731
## 4 0.01598402      5 0.2767233 0.2897103 0.01573402
## 5 0.01098901      6 0.2607393 0.2447552 0.01465035
## 6 0.01000000     10 0.2087912 0.2287712 0.01422813
# caret cross validation results
mdl_caret <- train(
  present ~ .,
  data       = d_train,
  method     = "rpart",
  trControl  = trainControl(method = "cv", number = 10),
  tuneLength = 20)

ggplot(mdl_caret)

vip(mdl_caret, num_features = 40, bar = FALSE)

Complexity plot threshold

According to the R Documantation for plotcp(), “A good choice of cp for pruning is often the leftmost value for which the mean lies below the horizontal line.” With this in mind, a tree with 11 branches is recommended.

3 most important variables

The variable importance chart above lists lon, lat, and WC_alt as the most important variables in determining presence/absence.

However, the nodes of the tree itself show a slightly different story: the entire tree is made up on splits of latitude and longitude, with the exception of a single node based on ER_topoWet. So while the importance of lat/long is clear, no decisions in the tree are made based on WC_alt, and ER_topoWet does show up in the tree despite being ranked as very low importance.

Random Forests

# number of features
n_features <- length(setdiff(names(d_train), "present"))

# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)

# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.2928917
# re-run model with impurity-based variable importance
mdl_impurity <- ranger(
  present ~ ., data = d_train,
  importance = "impurity")

# re-run model with permutation-based variable importance
mdl_permutation <- ranger(
  present ~ ., data = d_train,
  importance = "permutation")

p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)

gridExtra::grid.arrange(p1, p2, nrow = 1)

Variable importance in rpart vs Random Forest

Because Random Forest uses many trees combined to come up with the best overall solution, it may not line up with the single decision tree created above. This can be seen slightly differently in each of the variable importance charts: the permutation-based importance is in mostly the same order as the individual decision tree (only lat/lon are switched), while the impurity-based importance varies with ER_topoWet gaining importance over WC_bio2.

1d: Evaluate

# read points of observation: presence (1) and absence (0)
pts <- read_sf(pts_geo)

Training/Testing Data Split

# create training set with 80% of full data
pts_split  <- rsample::initial_split(
  pts, prop = 0.8, strata = "present")
pts_train  <- rsample::training(pts_split)
pts_test   <- rsample::testing(pts_split)

pts_train_p <- pts_train %>% 
  filter(present == 1) %>% 
  as_Spatial()
pts_train_a <- pts_train %>% 
  filter(present == 0) %>% 
  as_Spatial()

Model Selection

# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)

# calculate variance inflation factor per predictor, a metric of multicollinearity between variables
vif(env_stack)
##    Variables      VIF
## 1     WC_alt 3.018233
## 2    WC_bio1 2.880445
## 3    WC_bio2 1.319170
## 4     ER_tri 4.814898
## 5 ER_topoWet 5.043996
# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th=0.7) 
v
## 2 variables from the 5 input variables have collinearity problem: 
##  
## ER_topoWet WC_alt 
## 
## After excluding the collinear variables, the linear correlation coefficients ranges between: 
## min correlation ( ER_tri ~ WC_bio1 ):  -0.01596287 
## max correlation ( WC_bio2 ~ WC_bio1 ):  -0.3281011 
## 
## ---------- VIFs of the remained variables -------- 
##   Variables      VIF
## 1   WC_bio1 1.140378
## 2   WC_bio2 1.262452
## 3    ER_tri 1.126835
# reduce enviromental raster stack by 
env_stack_v <- usdm::exclude(env_stack, v)

# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)

mdl_maxv_rds  <- file.path(dir_data, "mdl_maxv.rds")

# fit a maximum entropy model
if (!file.exists(mdl_maxv_rds)){
  mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
  readr::write_rds(mdl_maxv, mdl_maxv_rds)
}
# mdl_maxv <- read_rds(mdl_maxv.rds)
mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
## This is MaxEnt version 3.4.3
# plot variable contributions per predictor
plot(mdl_maxv)

# plot term plots
dismo::response(mdl_maxv)

# note: JVM was messed up so had to run `rJava::.jpackage("dismo")` to be able to run it

# predict
y_maxv <- predict(env_stack, mdl_maxv) #, ext=ext, progress='')

plot(y_maxv, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

#### Variables removed due to multicolinearity; rank of remaining variables

ER_topoWet and WC_alt were removed due to multicolinearity. Of the remaining 3 variables, WC_bio2 is the most important, followed by WC_bio1, and finally ER_tri is the least important.

Model Performance

pts_test_p <- pts_test %>% 
  filter(present == 1) %>% 
  as_Spatial()
pts_test_a <- pts_test %>% 
  filter(present == 0) %>% 
  as_Spatial()

# y_maxv <- predict(mdl_maxv, env_stack)
# plot(y_maxv)

e <- dismo::evaluate(
  p     = pts_test_p,
  a     = pts_test_a, 
  model = mdl_maxv,
  x     = env_stack)
e
## class          : ModelEvaluation 
## n presences    : 250 
## n absences     : 251 
## AUC            : 0.6708526 
## cor            : 0.3210313 
## max TPR+TNR at : 0.6370805
plot(e, 'ROC')

thr <- threshold(e)[['spec_sens']]
print(paste(thr, "is the ROC threshold value"))
## [1] "0.63708045 is the ROC threshold value"
p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)

# (t)rue/(f)alse (p)ositive/(n)egative rates
tpr <- sum(p_true)/length(p_true)
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)

matrix(
  c(tpr, fnr,
    fpr, tnr), 
  nrow=2, dimnames = list(
    c("present_obs", "absent_obs"),
    c("present_pred", "absent_pred")))
##             present_pred absent_pred
## present_obs        0.744   0.4541833
## absent_obs         0.256   0.5458167
# add point to ROC plot
points(fpr, tpr, pch=23, bg="blue")

plot(y_maxv > thr, main='Maxent binary habitat')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')


  1. Rationale are based off my own intuition and general familiarity with the species, not an actual literature search.↩︎